# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
# Show what data files Kaggle mounted under the input directory.
available_inputs = os.listdir("../input")
print(available_inputs)
# Any results you write to the current directory are saved as output.
from tqdm import tqdm
from statsmodels.graphics.gofplots import qqplot
ls '../input/input_data/'
Three folders are provided — one each for drive, trip and weather data. These folders contain multiple parquet files, which will need to be concatenated together to yield one consolidated dataframe per source. In this analysis, we will focus on EDA of the Drive features.
### get list of all the files for drive
### collect every parquet part-file name available for the drive source
driveF = [fname for fname in os.listdir('../input/input_data/drive/') if 'parquet' in fname]
print("Total {} partial files for drive features".format(len(driveF)))
### lets build one dataframe each for trip, drive, and weather
Path = '../input/input_data/'
def consolidateFiles(sourceType: str, iterfiles: list) -> pd.DataFrame:
    """Read all parquet part-files for one source and stack them row-wise.

    Parameters
    ----------
    sourceType : str
        Sub-folder name under `Path` ('drive', 'trip' or 'weather').
    iterfiles : list
        Parquet file names found inside that sub-folder.

    Returns
    -------
    pd.DataFrame
        All part-files concatenated along axis 0 (original indices kept).
    """
    print(" ------ Consolidating for {} ------- ".format(sourceType))
    # Collect the parts in a list and concatenate ONCE at the end:
    # growing a DataFrame with pd.concat inside the loop copies all prior
    # rows on every iteration (accidental O(n^2)).
    parts = [
        pd.read_parquet(os.path.join(Path, '{}/{}'.format(sourceType, f_)))
        for f_ in tqdm(iterfiles)
    ]
    # axis spelled as a keyword (positional axis is deprecated in pandas);
    # an empty file list still yields an empty DataFrame like the original.
    return pd.concat(parts, axis=0) if parts else pd.DataFrame()
driveDF = consolidateFiles('drive', driveF)
print("All Drive files read & consolidated")
### Quick look into columns (first two rows only)
driveDF.head(n=2)
## Univariate analysis
## Multivariate analysis
### divide into continuous and categorical cols
# Split the columns: float dtypes are treated as continuous, the rest
# (ids, datetime, ...) as categorical / other.
numericCol = driveDF.select_dtypes(float).columns
others = [col for col in driveDF.columns if col not in numericCol]
import seaborn as sns
import matplotlib.pyplot as plt
# One distribution plot per continuous column.
for n_ in numericCol:
    # FIX: pass the Series itself instead of wrapping it in a one-element
    # list — seaborn then uses the column name for the x-axis label and we
    # avoid relying on an implicit squeeze of a (1, n) array.
    sns.distplot(driveDF[n_])
    plt.title("Distribution for {}".format(n_))
    plt.show()
### Also plot boxplots for outliers, one per continuous column
for col in numericCol:
    sns.boxplot([driveDF[col]])
    plt.title("Boxplot for {}".format(col))
    plt.show()
Velocity is normally distributed with mean ($\mu$) ~ 64 and variance ($\sigma^2$) ~ 175. Accel_x, accel_y and accel_z interestingly all have a bi-modal distribution, comprising almost two normal distributions. Engine coolant temp is also bi-modal. Engine load is approximately normally distributed with mean ($\mu$) ~ 204 and variance ($\sigma^2$) ~ 100. Fuel level and rpm do not look like any parametric distribution. Velocity and engine load have a lot of extreme values (> 1.5 IQR and < -1.5 IQR).
### Univariate Analysis for accel_x
# Closer look at the bi-modal shape of accel_x on its own.
accelX = driveDF['accel_x']
sns.distplot(accelX)
There are 2 broad behaviours depicted in the above:
- While driving within city limits, a person will only accelerate up to about 50 (~ mean of the left Gaussian)
- While outside the city, say travelling on highways, acceleration will be slightly higher (mean ~ 80 for the right Gaussian)
- In terms of feature engineering, it might be interesting to also provide info about whether the data is from the left or right Gaussian; perhaps even encoding the respective side's mean in some form might help
### also check if the behavior is consistent for all three directional acceleration elements
### Univariate Analysis: accel_x, accel_y and accel_z overlaid on one axis
sns.distplot(driveDF['accel_x'])
sns.distplot(driveDF['accel_y'])
sns.distplot(driveDF['accel_z'])
plt.plot()  # NOTE(review): probably meant plt.show(); kept as-is to preserve behavior
# FIX: removed the stray bare ';' line (a notebook output-suppressor);
# it is a syntax error in a plain .py file.
sns.distplot(driveDF['eng_load'])
# BUG FIX: the format string used {0} for both fields, so the "Variance"
# slot re-printed the mean; auto-numbered fields show each statistic.
print("Mean: {:.1f} Variance: {:.1f}".format(driveDF['eng_load'].mean(), driveDF['eng_load'].var()))
### confirm normality by QQ plots
qqplot(driveDF['eng_load'])
from scipy.stats import shapiro
# Shapiro-Wilk test: H0 = the sample was drawn from a Gaussian.
# NOTE(review): scipy warns that shapiro is only reliable for N <= 5000 —
# confirm the sample size before trusting the p-value.
stat, p = shapiro(driveDF['eng_load'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret at the conventional 5% significance level
alpha = 0.05
if p <= alpha:
    print('Sample does not look Gaussian (reject H0)')
else:
    print('Sample looks Gaussian (fail to reject H0)')
sns.distplot(driveDF['fuel_level'])
# BUG FIX: {0} was repeated, so the variance slot displayed the mean;
# auto-numbered fields print each statistic correctly.
print("Mean: {:.1f} Variance: {:.1f}".format(driveDF['fuel_level'].mean(), driveDF['fuel_level'].var()))
sns.distplot(driveDF['rpm'])
# BUG FIX: {0} was repeated, so the variance slot displayed the mean;
# auto-numbered fields print each statistic correctly.
print("Mean: {:.1f} Variance: {:.1f}".format(driveDF['rpm'].mean(), driveDF['rpm'].var()))
others  # inspect the non-float columns (ids, datetime, ...)
vehicleIds = driveDF['vehicle_id'].unique()
print("Count of unique vehicles: {}".format(len(vehicleIds)))
# Record counts per vehicle as a bar chart.
plt.figure(figsize=(20, 5))
sns.countplot(driveDF['vehicle_id'])
print("Count of unique trips: {}".format(len(driveDF['trip_id'].unique())))
# Record counts per trip, then the distribution of trip lengths.
plt.figure(figsize=(20, 5))
sns.countplot(driveDF['trip_id'])
tripLengths = driveDF['trip_id'].value_counts()
sns.distplot(tripLengths.values, kde=False)
plt.title("Countplot for trips")
# Velocity distribution, one overlaid curve per vehicle.
for vehicle in driveDF['vehicle_id'].unique():
    sns.distplot(driveDF.loc[driveDF['vehicle_id'] == vehicle, 'velocity'])
### analysis by top 10 longest and shortest trips
# value_counts() sorts descending, so the head holds the trips with the
# most samples (longest) and the tail the fewest (shortest).
tripSizes = driveDF['trip_id'].value_counts()
top10shortTrips = tripSizes[-10:].index
top10longTrips = tripSizes[:10].index
for trip in top10longTrips:
    sns.distplot(driveDF.loc[driveDF['trip_id'] == trip, 'velocity'])
for trip in top10shortTrips:
    sns.distplot(driveDF.loc[driveDF['trip_id'] == trip, 'velocity'])
### analysis of velocity by hours
# Hoisted: dt.hour was recomputed on the full column for every hour value
# inside both loops; compute it once.
hourOfDay = driveDF['datetime'].dt.hour
uqHour = hourOfDay.unique()
plt.figure(figsize=(20, 5))
for h_ in uqHour:
    sns.distplot(driveDF.loc[hourOfDay == h_, 'velocity'])
plt.title('Distribution of velocity across hours')  # FIX: typo 'velocty'
# Per-hour mean and variance, in hour order.
for h_ in np.sort(uqHour):
    hourly = driveDF.loc[hourOfDay == h_, 'velocity']
    print(h_, hourly.mean(), hourly.var())
### velocity X accel
# Magnitude of the 3-axis acceleration vector.
consolAccel = np.sqrt(driveDF['accel_x']**2 + driveDF['accel_y']**2 + driveDF['accel_z']**2)
# FIX: typo 'veloctiy' in the printed label corrected to 'velocity'.
print('correlation b/w velocity and consolidated acceleration: {:0.2f}%'.format(
    np.corrcoef(driveDF['velocity'], consolAccel)[1][0] * 100))
### correlation b/w velocity,temp, engLoad, fuel, rpm
contCols = ['velocity', 'engine_coolant_temp', 'eng_load', 'fuel_level', 'rpm']
driveDF[contCols].corr()
# Coolant-temp distribution per vehicle: first one figure per vehicle ...
for vehicle in driveDF['vehicle_id'].unique():
    sns.distplot(driveDF.loc[driveDF['vehicle_id'] == vehicle, 'engine_coolant_temp'])
    plt.show()
# ... then all vehicles overlaid on a single figure.
for vehicle in driveDF['vehicle_id'].unique():
    sns.distplot(driveDF.loc[driveDF['vehicle_id'] == vehicle, 'engine_coolant_temp'])
plt.title("Coolant temp distribution across vehicles")
# Coolant-temp distribution over the 10 longest trips (overlaid).
for d_ in top10longTrips:
    d = driveDF[driveDF['trip_id'] == d_]
    sns.distplot(d['engine_coolant_temp'])
plt.title("Coolant temp distribution across 10 longest trips")
# BUG FIX: removed stray `i += 1` — `i` is never defined in this cell
# (NameError at runtime) and the counter was unused; copy/paste leftover.
# Same coolant-temp overlay for the 10 shortest trips.
for trip in top10shortTrips:
    sns.distplot(driveDF.loc[driveDF['trip_id'] == trip, 'engine_coolant_temp'])
plt.title("Coolant temp distribution across 10 shortest trips")
### analysis of coolant temp by hours — one figure per hour ...
uqHour = np.sort(driveDF['datetime'].dt.hour.unique())
for hr in uqHour:
    plt.figure(figsize=(20, 5))
    sns.distplot(driveDF.loc[driveDF['datetime'].dt.hour == hr, 'engine_coolant_temp'])
    plt.title('Distribution of coolant temp across hour: {}'.format(hr))
    plt.show()
### ... then all hours overlaid on a single figure
uqHour = driveDF['datetime'].dt.hour.unique()
plt.figure(figsize=(20, 5))
for hr in uqHour:
    sns.distplot(driveDF.loc[driveDF['datetime'].dt.hour == hr, 'engine_coolant_temp'])
plt.title('Distribution of coolant temp across hours')
# Engine-load distribution per vehicle: first one figure per vehicle ...
for vehicle in driveDF['vehicle_id'].unique():
    sns.distplot(driveDF.loc[driveDF['vehicle_id'] == vehicle, 'eng_load'])
    plt.show()
# ... then all vehicles overlaid on a single figure.
for vehicle in driveDF['vehicle_id'].unique():
    sns.distplot(driveDF.loc[driveDF['vehicle_id'] == vehicle, 'eng_load'])
plt.title("Engine Load distribution across vehicles")
# Engine-load distribution over the 10 longest trips (overlaid).
for d_ in top10longTrips:
    d = driveDF[driveDF['trip_id'] == d_]
    sns.distplot(d['eng_load'])
plt.title("Engine Load distribution across 10 longest trips")
# BUG FIX: removed stray `i += 1` — `i` is never defined in this cell
# (NameError at runtime) and the counter was unused; copy/paste leftover.
# Same engine-load overlay for the 10 shortest trips.
for trip in top10shortTrips:
    sns.distplot(driveDF.loc[driveDF['trip_id'] == trip, 'eng_load'])
plt.title("Engine Load distribution across 10 shortest trips")
### analysis of engine load by hours — one figure per hour ...
uqHour = np.sort(driveDF['datetime'].dt.hour.unique())
for hr in uqHour:
    plt.figure(figsize=(20, 5))
    sns.distplot(driveDF.loc[driveDF['datetime'].dt.hour == hr, 'eng_load'])
    plt.title('Distribution of engine load across hour: {}'.format(hr))
    plt.show()
### ... then all hours overlaid on a single figure
uqHour = driveDF['datetime'].dt.hour.unique()
plt.figure(figsize=(20, 5))
for hr in uqHour:
    sns.distplot(driveDF.loc[driveDF['datetime'].dt.hour == hr, 'eng_load'])
plt.title('Distribution of engine load across hours')
# RPM distribution per vehicle: first one figure per vehicle ...
for vehicle in driveDF['vehicle_id'].unique():
    sns.distplot(driveDF.loc[driveDF['vehicle_id'] == vehicle, 'rpm'])
    plt.show()
# ... then all vehicles overlaid on a single figure.
for vehicle in driveDF['vehicle_id'].unique():
    sns.distplot(driveDF.loc[driveDF['vehicle_id'] == vehicle, 'rpm'])
plt.title("RPM distribution across vehicles")
# RPM distribution over the 10 longest trips (overlaid) ...
for trip in top10longTrips:
    sns.distplot(driveDF.loc[driveDF['trip_id'] == trip, 'rpm'])
plt.title("RPM distribution across 10 longest trips")
# ... and over the 10 shortest.
for trip in top10shortTrips:
    sns.distplot(driveDF.loc[driveDF['trip_id'] == trip, 'rpm'])
plt.title("RPM distribution across 10 shortest trips")
### analysis of RPM by hours — one figure per hour ...
uqHour = np.sort(driveDF['datetime'].dt.hour.unique())
for hr in uqHour:
    plt.figure(figsize=(20, 5))
    sns.distplot(driveDF.loc[driveDF['datetime'].dt.hour == hr, 'rpm'])
    plt.title('Distribution of RPM across hour: {}'.format(hr))
    plt.show()
### ... then all hours overlaid on a single figure
uqHour = driveDF['datetime'].dt.hour.unique()
plt.figure(figsize=(20, 5))
for hr in uqHour:
    sns.distplot(driveDF.loc[driveDF['datetime'].dt.hour == hr, 'rpm'])
plt.title('Distribution of RPM across hours')